### -------------------------------------------------- 
### ---- General Notes ---- 
### -------------------------------------------------- 

# [Add Notes]


### -------------------------------------------------- 
### ---- General Set Up ---- 
### -------------------------------------------------- 

### Clear global environment
rm(list=ls()) 


### Libraries:
library(pacman)
p_load(tidyverse, foreign, readstata13, readr,
       magrittr, here)


### File Location (R project only)
here::i_am("Survey Only Match/102_match_tfa_to_state_uniname.R")
file_path <- here()

### -------------------------------------------------- 
### ---- Download TFA Data ----
### -------------------------------------------------- 

### Download Data
tfa_dat <- read.dta13(paste0(file_path,"/Data/tfa_data.dta"))


### -------------------------------------------------- 
### ---- Read/Generate Lists of University Names & ----
###     Locations to Merge
### -------------------------------------------------- 

### -------------------------------------------------- 
### Generate list of unique universities in TFA dataset 
### ("undergraduateuniversity") 
### -------------------------------------------------- 
unique_university <- dplyr::distinct(tfa_dat["undergraduateuniversity"]) %>%
  # Add variable indicating whether university name 
  # is found in TFA file 
  dplyr::mutate(in_tfa = 1) 

### -------------------------------------------------- 
### Read List of Universities in USA as uni_list
### (downloaded 4/2017 file on 3/23/2018)
### -------------------------------------------------- 

### Source: https://ope.ed.gov/accreditation/GetDownloadFile.aspx 
uni_list <- read_csv(paste0(file_path,"/Data/Accreditation_04_2017.csv"),
                     col_names = TRUE, na = c("", "NA")) %>%
  # Rename University Name Variable to match TFA dataset
  dplyr::rename(undergraduateuniversity = Institution_Name) %>%
  # Create variable to track successful matches
  dplyr::mutate(matches_list = 1) %>% 
  # Create variable with institution phone number area code
  dplyr::mutate(area_code = str_sub(Institution_Phone, start=1, end=3))


### -------------------------------------------------- 
### Subset uni_list to Relevant Location Info
### -------------------------------------------------- 

###Subset
uni_list <- uni_list %>% 
  dplyr::distinct(undergraduateuniversity, Institution_State, Campus_State, matches_list)


### -------------------------------------------------- 
### ---- Evaluating Match (Part I) ----
### -------------------------------------------------- 

### TFA universities matched to uni_list (used to determine TFA schools w/out match)
unique_uni_match <- left_join(x = unique_university,
                              y = uni_list,
                              by = "undergraduateuniversity") 


### Remaining Missing Values
# Original Number of Unique University Names
len_unique_uni <- length(unique_university[,1]) #= 1710
unique_uni_no_match_i <- sum(is.na(unique_uni_match[, "Institution_State"])) # = 490
unique_uni_no_match_c <- sum(is.na(unique_uni_match[, "Campus_State"])) %>% print() # = 1745
unique_uni_no_match <- sum(is.na(unique_uni_match[, "matches_list"])) # 490


### -------------------------------------------------- 
### ---- Evaluating Match (Part II) ----
### -------------------------------------------------- 

### uni_list matched to TFA universities (used to determine schools in uni_list w/ 2+ states)
uni_list_tfa <- left_join(x = uni_list,
                          y = unique_university,
                          by = "undergraduateuniversity")

### College names in tfa_dat and uni_list with INSTUTUTIONS in MULTIPLE states
unimatch_dup_institution <- uni_list_tfa %>% 
  dplyr::distinct(undergraduateuniversity, Institution_State, in_tfa) %>%
  dplyr::filter(( duplicated(undergraduateuniversity,fromLast=F) | 
                    duplicated(undergraduateuniversity,fromLast=T) ) & 
                  (in_tfa == 1)) %>%
  dplyr::arrange(undergraduateuniversity) %>% 
  print()

# Summarize: Number of INSTUTUTIONS in Different States  
unimatch_dup_institution_sum <- unimatch_dup_institution %>% 
  group_by(undergraduateuniversity) %>% 
  summarize(no_inst = n()) %>%
  arrange(desc(no_inst)) %>%
  print()

### College names in tfa_dat and uni_list with CAMPUSES in MULTIPLE states
unimatch_dup_campus <- uni_list_tfa %>% 
  dplyr::filter(!is.na(Campus_State)) %>%
  dplyr::distinct(undergraduateuniversity, Campus_State, in_tfa) %>%
  dplyr::filter(( duplicated(undergraduateuniversity,fromLast=F) | 
                    duplicated(undergraduateuniversity,fromLast=T) ) & 
                  (in_tfa == 1)) %>%
  dplyr::arrange(undergraduateuniversity) %>% 
  print()

# Summarize: Number of CAMPUSES in Different States  
unimatch_dup_campus_sum <- unimatch_dup_campus %>% 
  group_by(undergraduateuniversity) %>% 
  summarize(no_campuses = n()) %>%
  arrange(desc(no_campuses)) %>% 
  print()

#---
# Findings: 
# - Process: matched unique college names from TFA file to name, "Institution_State" 
#   and "Campus_State" in accredidation file 
# - Determined how many unique college names in the TFA dataset:
#   (0) Exist                                             = 1710  (len_unique_uni)
#   (1) Do NOT match any names in the accreditation file  = 490   (unique_uni_no_match)
#   (2) Do NOT have valid Institution_State data          = 490   (unique_uni_no_match_i)
#   (3) Do NOT have valid Campus_State data               = 1745  (unique_uni_no_match_c)
          # college names are counted twice if they have multiple values for 
          # campus state, leading to a total N of 2949
#   (4) Have MULTIPLE Institution_State options           = 26    (unimatch_dup_institution_sum)
#   (5) Have MULTIPLE Campus_State options                = 175   (unimatch_dup_campus_sum)
#---


### -------------------------------------------------- 
### ---- Update University Dataset to Fix ----
###      Universities in 2+ States
### -------------------------------------------------- 

### Create flags for multiple insitution / campus state options
multi_name_c <- unimatch_dup_campus_sum %>%
  dplyr::mutate(campus_multi = 1) 
multi_name_i <- unimatch_dup_institution_sum %>%
  dplyr::mutate(inst_multi = 1) 
multi_name <- left_join(multi_name_c,
                        multi_name_i,
                        by = "undergraduateuniversity")

### Merge with full uni_list dataset
uni_list_final <- left_join(x = uni_list,
                            y = multi_name,
                            by = "undergraduateuniversity") %>%
  #for inst./campus with only 1 match, set _State2 variable to the exclusive match
  #for multiple matches, leave _State2 blank
  dplyr::mutate(Institution_State2 = ifelse(is.na(inst_multi), Institution_State, NA)) %>%
  dplyr::mutate(Campus_State2 = ifelse(is.na(campus_multi), Campus_State, NA)) 


### -------------------------------------------------- 
### ---- Join TFA list with uni_list data ----
### -------------------------------------------------- 

### Merge tfa data wtih full uni_list dataset (with flags for multiple states)
tfa_dat_unimatch <- left_join(x = tfa_dat,
                              y = uni_list_final,
                              by = "undergraduateuniversity") 


### -------------------------------------------------- 
### ---- Evaluating Match (Part III) ----
### -------------------------------------------------- 


#---
# Findings: 
# - Determined how many college names in the TFA dataset were matched:
#   (0) Total Obs in TFA Dataset (obs_total)                  = 120,417 (100.0%)
#   (1) Total Obs w. 2+ Institution States (obs_uni_multi_i)  = 304     (  0.3%)
#   (2) Total Obs Not Matched* (obs_uni_none_i)               = 35,898  ( 29.8%)
#   (3) Total Obs Matched* (obs_uni_good_i)                   = 84,215  ( 69.9%) 
#   (4) Total Obs w. 2+ Campus States (obs_uni_multi_c)       = 15,177  ( 12.6%)
#
#   *Misc. Note: In general, more of the campus states are missing. It seems safe to assume 
#   that if we have one insitution state and no campus state, the campus is in the same 
#   state as the insitution. Thus, I have used the institution variable to calculate 
#   the number of obs still missing. 
#---


### -------------------------------------------------- 
### ---- Loop through state names ----
### -------------------------------------------------- 

### Loop through List of University Names in TFA dataset
data(state)
est_state_all <- rep(NA, length(unique_university[,1]))
for(name in seq_along(state.name)){
  for(i in seq_along(unique_university[,1])){
    if(grepl(state.name[name], unique_university[i,"undergraduateuniversity"]))
      est_state_all[i] <- state.abb[name]
  }
}

### Merge Estimated States w/ University Names in TFA dataset
unique_uni_state <- unique_university %>% 
  select(undergraduateuniversity) %>%
  dplyr::mutate(name_state = est_state_all) #%>% print()



### -------------------------------------------------- 
### ---- Join TFA university list with est. ----
#        university state dataset
### -------------------------------------------------- 

### Merge tfa data wtih full uni_list dataset (with flags for multiple states)
tfa_dat_unimatch2 <- left_join(tfa_dat_unimatch,
                               unique_uni_state,
                               by = "undergraduateuniversity") %>% 
  dplyr::mutate(same_state = ifelse(name_state==Institution_State2, 1, 0))


### -------------------------------------------------- 
### ---- Improving State Match (name_state) ----
### -------------------------------------------------- 

### Fix Name Matches to Washington State
tfa_dat_unimatch2 <- tfa_dat_unimatch2 %>%
  mutate(name_state2 = ifelse(name_state=="University of Washington - Seattle", "WA", name_state)) %>%
  mutate(name_state2 = ifelse(name_state=="Washington University in St. Louis", "MO", name_state)) %>%
  mutate(name_state2 = ifelse(name_state=="George Washington University", "DC", name_state)) %>%
  mutate(name_state2 = ifelse(name_state=="Washington and Jefferson College", "PA", name_state)) %>%
  mutate(name_state2 = ifelse(name_state=="University of Washington - Tacoma", "WA", name_state)) %>%
  mutate(name_state2 = ifelse(name_state=="DeVry University - Washington", "WA", name_state)) %>%
  mutate(name_state2 = ifelse(name_state=="University of Washington - Bothell", "WA", name_state)) %>%
  mutate(name_state2 = ifelse(name_state=="Washington Bible College/Capital Bible Seminary", "MD", name_state)) %>%
  mutate(name_state2 = ifelse(name_state=="Fairhaven College at Western Washington University", "WA", name_state))

### Fix Name Matches to Other States (Based on quick subjective scan of name_state)
tfa_dat_unimatch2 <- tfa_dat_unimatch2 %>%
  mutate(name_state2 = ifelse(name_state=="Delaware Valley College", "PA", name_state)) %>%
  mutate(name_state2 = ifelse(name_state=="Penn State Delaware County", "PA", name_state)) 

### Identify Unmatched University Names
tfa_dat_unimatch2 %>% 
  filter(is.na(name_state)&is.na(Institution_State2)) %>% 
  distinct(undergraduateuniversity, .keep_all = TRUE) %>% 
  group_by(inst_multi) %>%
  summarize(count=n())

### Export list of Unmatched University Names (Uncomment to Export)
# unmatched_uni <- tfa_dat_unimatch2 %>% 
#   filter(is.na(name_state)&is.na(Institution_State2)&is.na(inst_multi)) %>% 
#   group_by(undergraduateuniversity) %>%
#   summarize(count=n()) %>%
#   arrange(desc(count)) %>%
#   mutate(state_handcoded = NA) %>%
#   mutate(multiple_states= NA)
# write.csv(unmatched_uni, file=paste0(file_path_data_matched,"/unmatched_uniname.csv"))
  # then hand-coded locations of these universities

### -------------------------------------------------- 
### ---- Combine and create state_uni ----
### -------------------------------------------------- 

tfa_dat_unimatch3 <- tfa_dat_unimatch2 %>%
  #Remove extra rows for applicants (see note about merging above)
  distinct(personid, .keep_all = TRUE) %>%
  #Remove Old Variables (not accurate since rows removed)
  select(-Institution_State, -Campus_State) %>%
  #First, prioritize Institution_State2
  mutate(state_uni = Institution_State2) %>%
  #Second, if no match, use name_state2 
  mutate(state_uni = ifelse(is.na(Institution_State2), name_state2, state_uni))

#
# Findings:
# (0) Obs w/ no State Match (tfa_dat_unimatch3_missing[2])           = 9,428   ( 7.8%)
# (1) Matched Obs w/ 2+ Campus Match (tfa_dat_unimatch3_multi[1,2])  = 14,873  (12.4%)
#
#---

### -------------------------------------------------- 
### ---- Merge w/ Hand Coded Data ---- 
### -------------------------------------------------- 

### Download Handcoded Data
uni_handcoded <- read_csv(paste0(file_path,"/Data/unmatched_uniname.csv"),
                          col_names = TRUE, na = c("", "NA")) %>%
  select(undergraduateuniversity, campus_multi_hc, state_handcoded, no_campuses_hc) %>%
  mutate(source_hc = 1)

### Merge with tfa_dat_unimatch3
tfa_dat_unimatch4 <- left_join(tfa_dat_unimatch3,
                               uni_handcoded,
                               by = "undergraduateuniversity") 

### Clean tfa_dat_unimatch4
tfa_dat_unimatch4 %<>%
  mutate(campus_multi = ifelse(campus_multi_hc==1 & is.na(campus_multi), campus_multi_hc, campus_multi) ) %>%
  mutate(no_campuses = ifelse(no_campuses_hc==1 & is.na(no_campuses), no_campuses_hc, no_campuses) ) %>%
  mutate(state_uni = ifelse(!is.na(state_handcoded), state_handcoded, state_uni) ) %>%
  mutate(uni_state_source = NA) %>%
  mutate(uni_state_source = ifelse(source_hc==1, "handcoded", uni_state_source) ) %>%
  mutate(uni_state_source = ifelse(!is.na(name_state), "name", uni_state_source) ) %>%
  mutate(uni_state_source = ifelse(!is.na(Institution_State2), "accreditation", uni_state_source) ) 
  
           


### -------------------------------------------------- 
### ---- Saved Matched Data ---- 
### -------------------------------------------------- 

tfa_dat_unimatch_FINAL <- tfa_dat_unimatch4

save(tfa_dat_unimatch_FINAL, 
     file=paste0(file_path,"/Survey Only Match/Temp_Data/tfa_to_state_uniname.RData"))


